library(janitor)
package ‘janitor’ was built under R version 3.6.2replacing previous import ‘vctrs::data_frame’ by ‘tibble::data_frame’ when loading ‘dplyr’
Attaching package: ‘janitor’

The following objects are masked from ‘package:stats’:

    chisq.test, fisher.test
library(fastDummies)
library(broom)
package ‘broom’ was built under R version 3.6.2
library(rpart)
library(rpart.plot)
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
  method         from
  print.tbl_lazy     
  print.tbl_sql      
── Attaching packages ─────────────────────────────────────────────── tidyverse 1.3.0 ──
✓ ggplot2 3.3.2     ✓ purrr   0.3.4
✓ tibble  3.0.3     ✓ dplyr   1.0.0
✓ tidyr   1.1.0     ✓ stringr 1.4.0
✓ readr   1.3.1     ✓ forcats 0.5.0
package ‘ggplot2’ was built under R version 3.6.2package ‘tibble’ was built under R version 3.6.2package ‘tidyr’ was built under R version 3.6.2package ‘purrr’ was built under R version 3.6.2package ‘dplyr’ was built under R version 3.6.2── Conflicts ────────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
library(factoextra)
package ‘factoextra’ was built under R version 3.6.2Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
customer_data <- read_csv("data/mall_customers.csv")
Parsed with column specification:
cols(
  CustomerID = col_double(),
  Gender = col_character(),
  Age = col_double(),
  `Annual Income (k$)` = col_double(),
  `Spending Score (1-100)` = col_double()
)
summary(customer_data)

find the data to be quite straight forward and should be reaonable to use for clustering purposes as there is only a few variables, there is no NA’s either

We are interested in creating a marketing campaign to target customers based on their spending score and annual income. Perform a k-means clustering to find if there are meaningful clusters in the data to target the customers.

ggplot(customer_data, aes(x = spending_score_1_100, y = annual_income_k)) +
  geom_point()

Perform k-means clustering and chose a value of k.

customer <- customer_data %>%
  select(spending_score_1_100, annual_income_k)

customer
summary(customer)
 spending_score_1_100 annual_income_k 
 Min.   : 1.00        Min.   : 15.00  
 1st Qu.:34.75        1st Qu.: 41.50  
 Median :50.00        Median : 61.50  
 Mean   :50.20        Mean   : 60.56  
 3rd Qu.:73.00        3rd Qu.: 78.00  
 Max.   :99.00        Max.   :137.00  
customer_scale <- customer %>%
  mutate_all(scale)

customer_scale
clustered_customers <- kmeans(customer_scale, centers = 6, nstart = 25)
clustered_customers
K-means clustering with 6 clusters of sizes 10, 81, 35, 22, 23, 29

Cluster means:
  spending_score_1_100 annual_income_k
1           1.23143545       1.8709508
2          -0.02638995      -0.2004097
3          -1.28122394       1.0523622
4           1.12934389      -1.3262173
5          -1.13411939      -1.3042458
6           1.23811207       0.6850149

Clustering vector:
  [1] 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5
 [42] 4 5 2 5 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 [83] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
[124] 6 3 6 2 6 3 6 3 6 2 6 3 6 3 6 3 6 3 6 2 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6
[165] 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1

Within cluster sum of squares by cluster:
[1]  3.681858 14.485632 18.304646  5.217630  7.577407  5.514889
 (between_SS / total_SS =  86.2 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      
library(broom)
glance(clustered_customers)
augment(clustered_customers, customer)
library(animation)

customer_scale %>% 
  kmeans.ani(centers = 6)

Visualise the clustering for your chosen value of k.

# Set min & max number of clusters want to look at 
max_k <- 20 

k_clusters <- tibble(k = 1:max_k) %>%
  mutate(
    kclust = map(k, ~ kmeans(customer_scale, .x, nstart = 25)), 
    tidied = map(kclust, tidy),
    glanced = map(kclust, glance),
    augmented = map(kclust, augment, customer)
  )

k_clusters
NA
clusterings <- k_clusters %>%
  unnest(glanced)

clusterings
ggplot(clusterings, aes(x=k, y=tot.withinss)) +
  geom_point() +
    geom_line() +
    scale_x_continuous(breaks = seq(1, 20, by = 1))

fviz_nbclust(customer_scale, kmeans, method = "wss", nstart = 25)
Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     

fviz_nbclust(customer_scale, kmeans, method = "silhouette", nstart = 25)

fviz_nbclust(customer_scale, kmeans, method = "gap_stat", nstart = 25, k.max = 10)
Clustering k = 1,2,..., K.max (= 10): .. done
Bootstrapping, b = 1,2,..., B (= 100)  [one "." per sample]:
.................................................. 50 
.................................................. 100 

 clusterings %>% 
  unnest(cols = c(augmented)) %>%
  filter(k <= 5) %>%
 ggplot(aes(x = spending_score_1_100, y = annual_income_k)) +
  geom_point(aes(color = .cluster)) + 
  facet_wrap(~ k)

 clusterings %>% 
  unnest(cols = c(augmented)) %>%
  filter(k == 5) %>%
 ggplot(aes(x = spending_score_1_100, y = annual_income_k, colour = .cluster)) +
  geom_point(aes(color = .cluster)) 

 clusterings %>% 
  unnest(augmented) %>%
  filter(k == 5) %>%
  group_by(.cluster) %>%
  summarise(mean(spending_score_1_100), mean(annual_income_k))
`summarise()` ungrouping output (override with `.groups` argument)

Do you think the clustering seems a good fit for this data?

Yes this clustering seems good for this data, at a k means = 5, a cluster of 5 seems to fit well.

Comment on the attributes on one or two of the clusters (maybe even give them a label if you like - like in section 4.1 of the ‘Segmentation & clustering intro’ lesson).

The clusters shouw quite a close spread of the clustered data the purple cluster is very close clsutered.

LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3J9CmxpYnJhcnkoamFuaXRvcikKbGlicmFyeShmYXN0RHVtbWllcykKbGlicmFyeShicm9vbSkKbGlicmFyeShycGFydCkKbGlicmFyeShycGFydC5wbG90KQpsaWJyYXJ5KHRpZHl2ZXJzZSkKbGlicmFyeShmYWN0b2V4dHJhKQpgYGAKCmBgYHtyfQpjdXN0b21lcl9kYXRhIDwtIHJlYWRfY3N2KCJkYXRhL21hbGxfY3VzdG9tZXJzLmNzdiIpCmBgYAoKYGBge3J9CmN1c3RvbWVyX2RhdGEgPC0gY3VzdG9tZXJfZGF0YSAlPiUKICBjbGVhbl9uYW1lcygpCmBgYAoKYGBge3J9CnN1bW1hcnkoY3VzdG9tZXJfZGF0YSkKYGBgCgojIGZpbmQgdGhlIGRhdGEgdG8gYmUgcXVpdGUgc3RyYWlnaHQgZm9yd2FyZCBhbmQgc2hvdWxkIGJlIHJlYW9uYWJsZSB0byB1c2UgZm9yIGNsdXN0ZXJpbmcgcHVycG9zZXMgYXMgdGhlcmUgaXMgb25seSBhIGZldyB2YXJpYWJsZXMsIHRoZXJlIGlzIG5vIE5BJ3MgZWl0aGVyCgpXZSBhcmUgaW50ZXJlc3RlZCBpbiBjcmVhdGluZyBhIG1hcmtldGluZyBjYW1wYWlnbiB0byB0YXJnZXQgY3VzdG9tZXJzIGJhc2VkIG9uIHRoZWlyIHNwZW5kaW5nIHNjb3JlIGFuZCBhbm51YWwgaW5jb21lLiBQZXJmb3JtIGEgay1tZWFucyBjbHVzdGVyaW5nIHRvIGZpbmQgaWYgdGhlcmUgYXJlIG1lYW5pbmdmdWwgY2x1c3RlcnMgaW4gdGhlIGRhdGEgdG8gdGFyZ2V0IHRoZSBjdXN0b21lcnMuCgpgYGB7cn0KZ2dwbG90KGN1c3RvbWVyX2RhdGEsIGFlcyh4ID0gc3BlbmRpbmdfc2NvcmVfMV8xMDAsIHkgPSBhbm51YWxfaW5jb21lX2spKSArCiAgZ2VvbV9wb2ludCgpCmBgYAoKClBlcmZvcm0gay1tZWFucyBjbHVzdGVyaW5nIGFuZCBjaG9zZSBhIHZhbHVlIG9mIGsuCgpgYGB7cn0KY3VzdG9tZXIgPC0gY3VzdG9tZXJfZGF0YSAlPiUKICBzZWxlY3Qoc3BlbmRpbmdfc2NvcmVfMV8xMDAsIGFubnVhbF9pbmNvbWVfaykKCmN1c3RvbWVyCmBgYAoKYGBge3J9CnN1bW1hcnkoY3VzdG9tZXIpCmBgYAoKYGBge3J9CmN1c3RvbWVyX3NjYWxlIDwtIGN1c3RvbWVyICU+JQogIG11dGF0ZV9hbGwoc2NhbGUpCgpjdXN0b21lcl9zY2FsZQpgYGAKCmBgYHtyfQpjbHVzdGVyZWRfY3VzdG9tZXJzIDwtIGttZWFucyhjdXN0b21lcl9zY2FsZSwgY2VudGVycyA9IDYsIG5zdGFydCA9IDI1KQpjbHVzdGVyZWRfY3VzdG9tZXJzCmBgYAoKYGBge3J9CmxpYnJhcnkoYnJvb20pCmBgYAoKYGBge3J9CnRpZHkoY2x1c3RlcmVkX2N1c3RvbWVycykKCmBgYAoKYGBge3J9CmdsYW5jZShjbHVzdGVyZWRfY3VzdG9tZXJzKQpgYGAKCgpgYGB7cn0KYXVnbWVudChjbHVzdGVyZWRfY3VzdG9tZXJzLCBjdXN0b21lcikKYGBgCgpgYGB7cn0KbGlicmFyeShhbmltYXRpb24pCgpjdXN0b21lcl9zY2FsZSAlPiUgCiAga21lYW5zLmFuaShjZW50ZXJzID0gNikKYGBgCgoKVmlzdWFsaXNlIHRoZSBjbHVzdGVyaW5nIGZvciB5b3VyIGNob3NlbiB2YWx1ZSBvZiBrLgoKYGBge3J9CiMgU2V0IG1pbiAmIG1heCBudW1iZXIgb2YgY2x1c3RlcnMgd2FudCB0byBsb29rIGF0IAptYXhfayA8LSAyMCAKCmtfY2x1c3RlcnMgPC0gdGliYmxlKGsgPSAxOm1heF9rKSAlPiUKICBtdXRhdGUoCiAgICBrY2x1c3QgPSBtYXAoaywgfiBrbWVhbnMoY3VzdG9tZXJfc2NhbGUsIC54LCBuc3RhcnQgPSAyNSkpLCAKICAgIHRpZGllZCA9IG1hcChrY2x1c3QsIHRpZHkpLAogICAgZ2xhbmNlZCA9IG1hcChrY2x1c3QsIGdsYW5jZSksCiAgICBhdWdtZW50ZWQgPSBtYXAoa2NsdXN0LCBhdWdtZW50LCBjdXN0b21lcikKICApCgprX2NsdXN0ZXJzCgpgYGAKCmBgYHtyfQpjbHVzdGVyaW5ncyA8LSBrX2NsdXN0ZXJzICU+JQogIHVubmVzdChnbGFuY2VkKQoKY2x1c3RlcmluZ3MKYGBgCgpgYGB7cn0KZ2dwbG90KGNsdXN0ZXJpbmdzLCBhZXMoeD1rLCB5PXRvdC53aXRoaW5zcykpICsKICBnZW9tX3BvaW50KCkgKwogICAgZ2VvbV9saW5lKCkgKwogICAgc2NhbGVfeF9jb250aW51b3VzKGJyZWFrcyA9IHNlcSgxLCAyMCwgYnkgPSAxKSkKYGBgCgpgYGB7cn0KZnZpel9uYmNsdXN0KGN1c3RvbWVyX3NjYWxlLCBrbWVhbnMsIG1ldGhvZCA9ICJ3c3MiLCBuc3RhcnQgPSAyNSkKYGBgCgpgYGB7cn0KZnZpel9uYmNsdXN0KGN1c3RvbWVyX3NjYWxlLCBrbWVhbnMsIG1ldGhvZCA9ICJzaWxob3VldHRlIiwgbnN0YXJ0ID0gMjUpCmBgYAoKYGBge3J9CmZ2aXpfbmJjbHVzdChjdXN0b21lcl9zY2FsZSwga21lYW5zLCBtZXRob2QgPSAiZ2FwX3N0YXQiLCBuc3RhcnQgPSAyNSwgay5tYXggPSAxMCkKYGBgCgpgYGB7cn0KIGNsdXN0ZXJpbmdzICU+JSAKICB1bm5lc3QoY29scyA9IGMoYXVnbWVudGVkKSkgJT4lCiAgZmlsdGVyKGsgPD0gNSkgJT4lCiBnZ3Bsb3QoYWVzKHggPSBzcGVuZGluZ19zY29yZV8xXzEwMCwgeSA9IGFubnVhbF9pbmNvbWVfaykpICsKICBnZW9tX3BvaW50KGFlcyhjb2xvciA9IC5jbHVzdGVyKSkgKyAKICBmYWNldF93cmFwKH4gaykKYGBgCgpgYGB7cn0KIGNsdXN0ZXJpbmdzICU+JSAKICB1bm5lc3QoY29scyA9IGMoYXVnbWVudGVkKSkgJT4lCiAgZmlsdGVyKGsgPT0gNSkgJT4lCiBnZ3Bsb3QoYWVzKHggPSBzcGVuZGluZ19zY29yZV8xXzEwMCwgeSA9IGFubnVhbF9pbmNvbWVfaywgY29sb3VyID0gLmNsdXN0ZXIpKSArCiAgZ2VvbV9wb2ludChhZXMoY29sb3IgPSAuY2x1c3RlcikpIApgYGAKCmBgYHtyfQogY2x1c3RlcmluZ3MgJT4lIAogIHVubmVzdChhdWdtZW50ZWQpICU+JQogIGZpbHRlcihrID09IDUpICU+JQogIGdyb3VwX2J5KC5jbHVzdGVyKSAlPiUKICBzdW1tYXJpc2UobWVhbihzcGVuZGluZ19zY29yZV8xXzEwMCksIG1lYW4oYW5udWFsX2luY29tZV9rKSkKYGBgCgoKRG8geW91IHRoaW5rIHRoZSBjbHVzdGVyaW5nIHNlZW1zIGEgZ29vZCBmaXQgZm9yIHRoaXMgZGF0YT8KCiMgWWVzIHRoaXMgY2x1c3RlcmluZyBzZWVtcyBnb29kIGZvciB0aGlzIGRhdGEsIGF0IGEgayBtZWFucyA9IDUsIGEgY2x1c3RlciBvZiA1IHNlZW1zIHRvIGZpdCB3ZWxsLgoKQ29tbWVudCBvbiB0aGUgYXR0cmlidXRlcyBvbiBvbmUgb3IgdHdvIG9mIHRoZSBjbHVzdGVycyAobWF5YmUgZXZlbiBnaXZlIHRoZW0gYSBsYWJlbCBpZiB5b3UgbGlrZSAtIGxpa2UgaW4gc2VjdGlvbiA0LjEgb2YgdGhlIOKAmFNlZ21lbnRhdGlvbiAmIGNsdXN0ZXJpbmcgaW50cm/igJkgbGVzc29uKS4KCiMgVGhlIGNsdXN0ZXJzIHNob3V3IHF1aXRlIGEgY2xvc2Ugc3ByZWFkIG9mIHRoZSBjbHVzdGVyZWQgZGF0YSB0aGUgcHVycGxlIGNsdXN0ZXIgaXMgdmVyeSBjbG9zZSBjbHN1dGVyZWQu